M1 Lab

Matthew Præstgaard

due 01/31/2024

All files can be found at https://github.com/matthewep528/BIOSTLabs

library(tidyverse)
library(readxl)
library(car)
library(plotly)
library(ggthemr)
ggthemr("flat dark")
library(gridExtra)
library(ggh4x)
library(kableExtra)
library(jtools)

Housekeeping

Functions

# use to plot linear model with summary statistics and error
# Usage: ggRegression(data, title = "title" (optional))
ggRegression <- function(fit, title = "title") {
  require(ggplot2)

  ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
    geom_point() +
    stat_smooth(
      method = "lm",
      geom = "smooth"
    ) +
    labs(
      title = title,
      subtitle = paste(
        "Adj R2 = ", signif(summary(fit)$adj.r.squared, 5),
        "Intercept =", signif(fit$coef[[1]], 5),
        " Slope =", signif(fit$coef[[2]], 5),
        " P =", signif(summary(fit)$coef[2, 4], 5)
      )
    )
}
# plotlyRegression: Scatterplot with line, SE, and summary statistics
# requires a model to have been created beforehand
# Usage: plotlyRegression(fit, title = "title" (optional))
plotlyRegression <- function(fit, title = "title") {
  require(plotly)

  # Extract model data
  df <- fit$model
  x_var <- names(df)[2]
  y_var <- names(df)[1]

  # Create a scatter plot
  p <- plot_ly(df, x = ~ df[[x_var]], y = ~ df[[y_var]], type = "scatter", mode = "markers")

  # Get confidence interval data
  confint_df <- data.frame(predict(fit, interval = "confidence"))
  confint_df[[x_var]] <- df[[x_var]] # Adding the x variable to the dataframe

  # Add linear regression line and 95% confidence interval
  p <- p %>%
    add_lines(
      x = confint_df[[x_var]],
      y = confint_df$fit,
      line = list(color = "#0aca8d")
    ) %>%
    add_ribbons(
      x = confint_df[[x_var]],
      ymin = confint_df$lwr,
      ymax = confint_df$upr,
      line = list(color = "transparent"),
      fillcolor = "rgba(0, 100, 255, 0.2)"
    )

  # Create subtitle with summary statistics
  subtitle <- paste(
    "Adj R2 =", signif(summary(fit)$adj.r.squared, 5),
    ", Intercept =", signif(fit$coef[[1]], 5),
    ", Slope =", signif(fit$coef[[2]], 5),
    ", P =", signif(summary(fit)$coef[2, 4], 5)
  )

  # Add layout with combined title and subtitle
  p <- p %>%
    layout(
      title = list(
        text = paste0(title, "<br><sup>", subtitle, "</sup>"),
        font = list(size = 16)
      ),
      plot_bgcolor = "#2c2c2c",
      paper_bgcolor = "#2c2c2c",
      font = list(color = "#ffffff"),
      xaxis = list(title = x_var, color = "#ffffff"),
      yaxis = list(title = y_var, color = "#ffffff")
    )

  return(p)
}
# Function to summarize variables with optional grouping
# Usage: summarize.var(dataset, var, group_var (optional))
summarize.var <- function(dataset, var, group_var = NULL) {
  require(tidyverse)
  require(rlang)

  result <- dataset %>%
    {
      if (!is.null(group_var)) dplyr::group_by({{ group_var }}) else .
    } %>%
    summarise(
      mean = mean({{ var }}, na.rm = TRUE),
      median = median({{ var }}, na.rm = TRUE),
      min = min({{ var }}, na.rm = TRUE),
      max = max({{ var }}, na.rm = TRUE),
      q25 = quantile({{ var }}, 0.25, na.rm = TRUE), # first quartile
      q75 = quantile({{ var }}, 0.75, na.rm = TRUE), # third quartile
      sd = sd({{ var }}, na.rm = TRUE),
      n = length(na.omit({{ var }}))
    ) %>%
    mutate(IQR = q75 - q25)

  return(result)
}

Data Management

data <- read_xlsx("hwdata1.xlsx") %>%
  mutate(spbl = as.factor(spbl)) # changing spb1 to factor
kable(head(data), caption = "Table 1.")
Table 1.
HDL cholesterol triglyceride spbl
47 287 111 0
38 236 135 0
47 255 98 0
39 135 63 0
44 121 46 0
64 171 103 0
HDLsum <- summarize.var(data, HDL)
Cholsum <- summarize.var(data, cholesterol)
Trisum <- summarize.var(data, triglyceride)

summary <- rbind(HDLsum, Cholsum, Trisum)
summary$Variable <- c("HDL", "Cholesterol", "Triglyceride")
summary <- summary %>%
  select(Variable, everything()) %>%
  data.frame()
kable(summary, caption = "Table 2. Summary Statistics of Continuous Variables")
Table 2. Summary Statistics of Continuous Variables
Variable mean median min max q25 q75 sd n IQR
HDL 47.7619 46.5 27 76 40 56.00 10.60789 42 16.00
Cholesterol 267.8095 263.5 121 397 248 293.25 60.40321 42 45.25
Triglyceride 155.0476 140.0 46 408 103 179.00 73.75039 42 76.00

Separate SLR Models

We will start by fitting separate models to see if any of the thre predictors of interest are associated with the outcome.

Part 1) Fitting SLR models

Fit separate simple linear regression models for assessing the association of total cholesterol level (cholesterol), total triglyceride level (triglyceride) and sinking pre-beta lipoprotein (spbl) with HDL (HDL) (mg/dL). Write the estimated regression equation and interpret the slope coefficient for each of the three models.

# model with just cholesterol
lmChol <- lm(HDL ~ cholesterol, data = data)
summ(lmChol)
Observations 42
Dependent variable HDL
Type OLS linear regression
F(1,40) 0.40
0.01
Adj. R² -0.01
Est. S.E. t val. p
(Intercept) 52.47 7.58 6.92 0.00
cholesterol -0.02 0.03 -0.64 0.53
Standard errors: OLS
# model with just triglycerides
lmTri <- lm(HDL ~ triglyceride, data = data)
summ(lmTri)
Observations 42
Dependent variable HDL
Type OLS linear regression
F(1,40) 0.19
0.00
Adj. R² -0.02
Est. S.E. t val. p
(Intercept) 46.25 3.89 11.90 0.00
triglyceride 0.01 0.02 0.43 0.67
Standard errors: OLS
# model with just sinking pre-beta
lmSPBL <- lm(HDL ~ spbl, data = data)
summ(lmSPBL)
Observations 42
Dependent variable HDL
Type OLS linear regression
F(1,40) 7.58
0.16
Adj. R² 0.14
Est. S.E. t val. p
(Intercept) 43.77 2.10 20.85 0.00
spbl1 8.38 3.04 2.75 0.01
Standard errors: OLS
plotlyRegression(lmChol, title = "Fig 1. Plot of HDL by Cholesterol.")
plotlyRegression(lmTri, title = "Fig 2. Plot of HDL by Triglycerides.")
getRegressionEquation <- function(model, responseVar, predictorVar) {
  coefficients <- coef(model)
  equation <- sprintf("%s = %.2f + %.2f * %s", responseVar, coefficients[1], coefficients[2], predictorVar)
  return(equation)
}

equationTri <- getRegressionEquation(lmTri, "HDL", "triglyceride")
equationChol <- getRegressionEquation(lmChol, "HDL", "cholesterol")
equationSPBL <- getRegressionEquation(lmSPBL, "HDL", "spbl")

Cholesterol
\[\displaystyle{\displaylines{HDL = 52.47 + -0.02 * cholesterol}}\] Triglyceride
\[\displaystyle{\displaylines{HDL = 46.25 + 0.01 * triglyceride}}\] Sinking Pre-Beta
\[\displaystyle{\displaylines{HDL = 43.77 + 8.38 * spbl}}\]

The slopes of the first two models mean that for every one unit increase in cholesterol/triglyceride, HDL decreases by -0.02 and increases by 0.01, respectively. The coefficient of third model is the expected difference in HDL whether spbl is present or absent.

Part 2) Hypothesis testing in SLR models

Using the separate models fit in part 1, test whether total cholesterol level, triglyceride level, and sinking pre-beta lipoprotein alone statistically significantly predict y. Interpret your answer.

As seen in the output results above, only spbl statistically significantly predicts HDL (p = 0.01).

# show test/model output here

MLR models

We now wish to improve our estimation of HDL by incorporating multiple predictors in the same model. This will also allow us to assess possible confounding and effect modification.

Part 3) MLR model equation

Write the theoretical (population) regression equation that would be used to test whether total cholesterol level, triglyceride level, and sinking pre-beta lipoprotein taken together statistically significantly improve prediction of HDL. Interpret the coefficients in this model.

Model:

\[ \displaystyle{\displaylines{E(HDL) = \beta_{0} +\beta_{1}(Cholesterol)+ \beta_{2}(Triglyceride)+\beta_{3}(SPBl)}} \]

Interpretations:

β₀ (Intercept): Expected HDL when cholesterol, triglyceride, and SPBL are zero. β₁ (Cholesterol): Change in HDL per unit increase in cholesterol, holding others constant. β₂ (Triglyceride): Change in HDL per unit increase in triglyceride, holding others constant. β₃ (SPBL): Difference in HDL between presence and absence of SPBL, holding others constant.

Part 4) Fitting a MLR model and performing a global test

Fit the model specified above and test if total cholesterol level, triglyceride level, and sinking pre-beta lipoprotein taken together statistically significantly improve prediction of HDL. Interpret your answer.

# fit MLR model
MLR <- lm(HDL ~ cholesterol + triglyceride + spbl, data = data)
summ(MLR)
Observations 42
Dependent variable HDL
Type OLS linear regression
F(3,38) 2.74
0.18
Adj. R² 0.11
Est. S.E. t val. p
(Intercept) 48.86 7.25 6.74 0.00
cholesterol -0.03 0.03 -0.91 0.37
triglyceride 0.02 0.02 0.61 0.55
spbl1 8.15 3.11 2.62 0.01
Standard errors: OLS
## make sure to show the output for the model

Part 5) Variable added test

Fit a multivariable model and test whether sinking pre-beta (spbl) is associated with HDL after the combined contribution of cholesterol and triglyceride levels is taken into account (i.e., cholesterol and triglycerides are already in the model). Assume no interactions exist. State the appropriate null hypothesis for this test and interpret the results of the test.

aovSPB <- anova(lmSPBL, MLR)
kable(aovSPB)
Res.Df RSS Df Sum of Sq F Pr(>F)
40 3878.414 NA NA NA NA
38 3793.872 2 84.54191 0.4233924 0.6578727

Null hypothesis: Sinking pre-beta is not associated with HDL after accounting for cholesterol and triglyceride levels.

Test results and interpretation: p = 0.66
We conclude that, after accounting for cholesterol and triglyceride levels, sinking pre-beta is not associated with HDL.

Part 6) Model for effect modification

Suppose we are interested in evaluating if the association between cholesterol and HDL and the association between triglycerides and HDL depends on the absence/presence of SPBL. Write down the model statement for the model you need to fit to assess this and interpret the coefficients in this model.

Model:
\[\displaystyle{\displaylines{HDL = \beta_0 + \beta_1 \cdot \text{Cholesterol} + \beta_2 \cdot \text{Triglyceride} + \beta_3 \cdot \text{SPBL} + \beta_4 \cdot (\text{Cholesterol} \times \text{SPBL}) + \beta_5 \cdot (\text{Triglyceride} \times \text{SPBL}) + \epsilon}}\]

Interpretation of coefficients:
β₀ (Intercept): Expected HDL level when cholesterol, triglycerides, and SPBL are all zero.
β₁ (Cholesterol): Change in HDL for a one-unit increase in cholesterol, assuming SPBL is absent.
β₂ (Triglyceride): Change in HDL for a one-unit increase in triglycerides, assuming SPBL is absent.
β₃ (SPBL): Difference in HDL when SPBL is present versus absent, keeping other variables constant.
β₄ (Cholesterol × SPBL): Change in the effect of cholesterol on HDL due to the presence of SPBL.
β₅ (Triglyceride × SPBL): Change in the effect of triglycerides on HDL due to the presence of SPBL.

Part 7) Testing for effect modification

Fit a multivariable model and test whether the interactions of cholesterol with SPBL and triglycerides with SPBL are simultaneously equal to zero in a model already containing the three main effects for cholesterol, triglycerides, and SPBL. State the null hypothesis of the test. Given the result of the test, what can you conclude about the relationship of HDL to both cholesterol and triglycerides (hint: remember what an interaction means)?

# code to fit model
spblEM <- lm(HDL ~ cholesterol + triglyceride + spbl + cholesterol:spbl + triglyceride:spbl, data = data)
# test
summ(spblEM)
Observations 42
Dependent variable HDL
Type OLS linear regression
F(5,36) 1.73
0.19
Adj. R² 0.08
Est. S.E. t val. p
(Intercept) 52.33 9.26 5.65 0.00
cholesterol -0.05 0.04 -1.22 0.23
triglyceride 0.03 0.05 0.68 0.50
spbl1 -2.08 14.96 -0.14 0.89
cholesterol:spbl1 0.05 0.06 0.85 0.40
triglyceride:spbl1 -0.03 0.06 -0.50 0.62
Standard errors: OLS
summary(spblEM)
## 
## Call:
## lm(formula = HDL ~ cholesterol + triglyceride + spbl + cholesterol:spbl + 
##     triglyceride:spbl, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.4106  -8.5894  -0.6099   5.9938  23.5845 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        52.33466    9.25549   5.654 2.01e-06 ***
## cholesterol        -0.04956    0.04056  -1.222    0.230    
## triglyceride        0.03188    0.04703   0.678    0.502    
## spbl1              -2.07934   14.95830  -0.139    0.890    
## cholesterol:spbl1   0.05439    0.06414   0.848    0.402    
## triglyceride:spbl1 -0.02822    0.05610  -0.503    0.618    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.16 on 36 degrees of freedom
## Multiple R-squared:  0.1939, Adjusted R-squared:  0.08194 
## F-statistic: 1.732 on 5 and 36 DF,  p-value: 0.1523

Null hypothesis: \[\displaystyle{\displaylines{β(cholesterol):SPBL=β(triglyceride):SPBL=0}}\]

Interpretation: p = 0.1523 From this p-value, we fail to reject the null hypothesis that the interactions between cholesterol and SPBL and between triglycerides and SPBL are not significantly different from zero.

Part 8) Checking for confounding

For sinking pre-beta lipoprotein (spbl), compare the coefficient for spbl in the full model (in part 4) to the simple model in parts 1 and 2. Do you think there is confounding due to cholesterol and triglycerides? Explain.

simpleModel <- lm(HDL ~ spbl, data = data)
fullModel <- lm(HDL ~ spbl + cholesterol + triglyceride, data = data)

coef(simpleModel)
## (Intercept)       spbl1 
##   43.772727    8.377273
coef(fullModel)
##  (Intercept)        spbl1  cholesterol triglyceride 
##  48.86452734   8.14830890  -0.02731242   0.01503897

Explanation: 8.38 in the old vs 8.15 in the new model. This difference in the coefficients is relatively small, indicating that there is not substantial confounding.

Part 9) Model for different levels of categorical variables

Assume straight line models are appropriate for describing the relationship between HDL and total cholesterol level for the absence and the presence of sinking pre-beta. Write out a single regression model that specifies two separate lines for the relationship between HDL and cholesterol, one for the absence of sinking pre-beta lipoprotein (spbl=0) and one for the presence of sinking pre-beta lipoprotein (spbl=1). (You can omit triglycerides from this model). Run code to fit this model.

Model: \[HDL = \beta_0 + \beta_1 \cdot \text{Cholesterol} + \beta_2 \cdot \text{spbl} + \beta_3 \cdot (\text{Cholesterol} \times \text{spbl})\]

# fit model
part9lm <- lm(HDL ~ cholesterol + spbl + cholesterol:spbl, data = data)
summary(part9lm)
## 
## Call:
## lm(formula = HDL ~ cholesterol + spbl + cholesterol:spbl, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.030  -8.883  -1.791   6.051  23.875 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       52.88315    9.03305   5.854 9.02e-07 ***
## cholesterol       -0.03405    0.03282  -1.038    0.306    
## spbl1             -2.86205   14.50782  -0.197    0.845    
## cholesterol:spbl1  0.04199    0.05292   0.793    0.432    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.958 on 38 degrees of freedom
## Multiple R-squared:  0.1833, Adjusted R-squared:  0.1188 
## F-statistic: 2.843 on 3 and 38 DF,  p-value: 0.05051
## make sure to show the output for the model

Part 10) Interaction vs stratification

Fit separate models for each level of SPBL and compare these results to the results you got in part 9.

# Subset data for spbl = 0 and spbl = 1
data_spbl0 <- subset(data, spbl == 0)
data_spbl1 <- subset(data, spbl == 1)

# Fit separate models
model_spbl0 <- lm(HDL ~ cholesterol, data = data_spbl0)
model_spbl1 <- lm(HDL ~ cholesterol, data = data_spbl1)

# Output the summaries of the models
summary(model_spbl0)
## 
## Call:
## lm(formula = HDL ~ cholesterol, data = data_spbl0)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.030  -6.649  -2.355   5.668  16.940 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 52.88315    8.35619   6.329 3.55e-06 ***
## cholesterol -0.03405    0.03036  -1.122    0.275    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.212 on 20 degrees of freedom
## Multiple R-squared:  0.05919,    Adjusted R-squared:  0.01214 
## F-statistic: 1.258 on 1 and 20 DF,  p-value: 0.2753
summary(model_spbl1)
## 
## Call:
## lm(formula = HDL ~ cholesterol, data = data_spbl1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.070 -10.382   2.640   5.888  23.875 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 50.021098  12.228549   4.091 0.000687 ***
## cholesterol  0.007941   0.044726   0.178 0.861066    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.73 on 18 degrees of freedom
## Multiple R-squared:  0.001748,   Adjusted R-squared:  -0.05371 
## F-statistic: 0.03152 on 1 and 18 DF,  p-value: 0.8611

When SPBL is Absent (model_spbl0):
Coefficient of cholesterol: -0.03405 (not statistically significant, p = 0.275)

When SPBL is Present (model_spbl1):
Coefficient of cholesterol: 0.007941 (not statistically significant, p = 0.861066)

Part 11) Plotting separate lines

Plot the observed data and the fitted lines from part 9 for each value of sinking pre-beta (either 2 separate graphs with the same y axes or combined in one graph).

data %>%
  ggplot(aes(x = cholesterol, y = HDL, color = spbl)) +
  geom_point() +
  geom_smooth(
    method = "lm",
    aes(fill = spbl)
  )

Describe what you see: Lack of spbl shows a downward trend in HDL as cholesterol increases, while presence of spbl shows a slight upward trend.

Part 12) Testing for equality of lines

Fit the model you need to test for coincidence (equality) of the two lines in part 9. Perform the test and interpret.

equalLines <- lm(HDL ~ cholesterol, data = data)
anova(equalLines, part9lm) %>%
  summary()
##      Res.Df          RSS             Df      Sum of Sq           F        
##  Min.   :38.0   Min.   :3768   Min.   :2   Min.   :799.4   Min.   :4.031  
##  1st Qu.:38.5   1st Qu.:3968   1st Qu.:2   1st Qu.:799.4   1st Qu.:4.031  
##  Median :39.0   Median :4168   Median :2   Median :799.4   Median :4.031  
##  Mean   :39.0   Mean   :4168   Mean   :2   Mean   :799.4   Mean   :4.031  
##  3rd Qu.:39.5   3rd Qu.:4368   3rd Qu.:2   3rd Qu.:799.4   3rd Qu.:4.031  
##  Max.   :40.0   Max.   :4567   Max.   :2   Max.   :799.4   Max.   :4.031  
##                                NA's   :1   NA's   :1       NA's   :1      
##      Pr(>F)       
##  Min.   :0.02585  
##  1st Qu.:0.02585  
##  Median :0.02585  
##  Mean   :0.02585  
##  3rd Qu.:0.02585  
##  Max.   :0.02585  
##  NA's   :1

Interpretation: p = 0.4324 With the p-value calculated, we fail to reject the null hypothesis, suggesting no significant difference in the relationships across the two SPBL levels. We may consider the lines equal.

Part 13) Testing for parallel lines

Fit the model you need to test for parallelism (equal slopes) of the two lines in part 9. Perform the test and interpret.

parallelLines <- lm(HDL ~ cholesterol + spbl, data = data)
anova(parallelLines, part9lm) %>%
  summary()
##      Res.Df           RSS             Df      Sum of Sq           F         
##  Min.   :38.00   Min.   :3768   Min.   :1   Min.   :62.42   Min.   :0.6295  
##  1st Qu.:38.25   1st Qu.:3784   1st Qu.:1   1st Qu.:62.42   1st Qu.:0.6295  
##  Median :38.50   Median :3799   Median :1   Median :62.42   Median :0.6295  
##  Mean   :38.50   Mean   :3799   Mean   :1   Mean   :62.42   Mean   :0.6295  
##  3rd Qu.:38.75   3rd Qu.:3815   3rd Qu.:1   3rd Qu.:62.42   3rd Qu.:0.6295  
##  Max.   :39.00   Max.   :3830   Max.   :1   Max.   :62.42   Max.   :0.6295  
##                                 NA's   :1   NA's   :1       NA's   :1       
##      Pr(>F)      
##  Min.   :0.4324  
##  1st Qu.:0.4324  
##  Median :0.4324  
##  Mean   :0.4324  
##  3rd Qu.:0.4324  
##  Max.   :0.4324  
##  NA's   :1

Interpretation: With p = 0.43, we conclude that the slopes of the lines are not significantly different.

Part 14) Centering variables

Center the cholesterol and triglyceride variables at their mean value and refit the model from part 9. How do the models compare (which coefficients change and which stay the same)? Interpret the intercept in both models.

# center values
data <- data %>%
  mutate(
    cholesterol_centered = cholesterol - mean(cholesterol),
    triglyceride_centered = triglyceride - mean(triglyceride)
  )

centeredModel <- lm(HDL ~ cholesterol_centered + spbl + cholesterol_centered:spbl, data = data)
summary(centeredModel)
## 
## Call:
## lm(formula = HDL ~ cholesterol_centered + spbl + cholesterol_centered:spbl, 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.030  -8.883  -1.791   6.051  23.875 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                43.76374    2.12304  20.614  < 2e-16 ***
## cholesterol_centered       -0.03405    0.03282  -1.038  0.30600    
## spbl1                       8.38396    3.07658   2.725  0.00966 ** 
## cholesterol_centered:spbl1  0.04199    0.05292   0.793  0.43245    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.958 on 38 degrees of freedom
## Multiple R-squared:  0.1833, Adjusted R-squared:  0.1188 
## F-statistic: 2.843 on 3 and 38 DF,  p-value: 0.05051

Interpretation of intercept in original model: (52.88) Expected HDL level when cholesterol and triglyceride are at zero. Theoretical and less practical as it doesn’t make sense in the real world.

Interpretation of intercept in new model with centered variables: (43.76) Expected HDL level when cholesterol is at its average value. More realistic and interpretable.

Part 15) Scaling varibales

In this dataset, the units of measurement for cholesterol is mg/dL. Suppose we wish to convert this to mmol/L. To do so, we divide the value of cholesterol by 38.67. That is,

\[x \text{ mg/dL} = x \text{ mg/dL}\frac{1 \text{ mmol/L}}{38.67\text{ mg/dL}} =\frac{x}{38.67}\text{ mmol/L}\]

Apply this transformation and compare the simple linear regression model you fit in part 1 for cholesterol to a new model using the (uncentered) cholesterol value measured in mmol/L. What do you notice about the new regression coefficients?

# Convert cholesterol to mmol/L
data <- data %>%
  mutate(cholesterol_mmolL = cholesterol / 38.67)

model_mmolL <- lm(HDL ~ cholesterol_mmolL, data = data)
summary(model_mmolL)
## 
## Call:
## lm(formula = HDL ~ cholesterol_mmolL, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.649  -8.066  -1.267   7.827  28.189 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        52.4702     7.5806   6.922 2.41e-08 ***
## cholesterol_mmolL  -0.6798     1.0684  -0.636    0.528    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.69 on 40 degrees of freedom
## Multiple R-squared:  0.01002,    Adjusted R-squared:  -0.01473 
## F-statistic: 0.4049 on 1 and 40 DF,  p-value: 0.5282
plotlyRegression(model_mmolL, title = "Plot of HDL by Cholesterol (mmol/L)")

Interpretation: The coefficient now represents the change in HDL for a one-unit change in cholesterol measured in mmol/L. However, the adj-R2 and p-value remain the same. The new coefficient has changed, but the shape of the graph remains the same.

Session Information

sessionInfo()
## R version 4.3.2 (2023-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 11 x64 (build 25314)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] rlang_1.1.3      jtools_2.2.2     kableExtra_1.4.0 ggh4x_0.2.8     
##  [5] gridExtra_2.3    ggthemr_1.1.0    plotly_4.10.4    car_3.1-2       
##  [9] carData_3.0-5    readxl_1.4.3     lubridate_1.9.3  forcats_1.0.0   
## [13] stringr_1.5.1    dplyr_1.1.4      purrr_1.0.2      readr_2.1.5     
## [17] tidyr_1.3.1      tibble_3.2.1     ggplot2_3.4.4    tidyverse_2.0.0 
## [21] knitr_1.45      
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.4       xfun_0.41          bslib_0.6.1        htmlwidgets_1.6.4 
##  [5] lattice_0.22-5     tzdb_0.4.0         vctrs_0.6.5        tools_4.3.2       
##  [9] crosstalk_1.2.1    generics_0.1.3     fansi_1.0.6        highr_0.10        
## [13] pkgconfig_2.0.3    Matrix_1.6-5       data.table_1.14.10 lifecycle_1.0.4   
## [17] farver_2.1.1       compiler_4.3.2     munsell_0.5.0      htmltools_0.5.7   
## [21] sass_0.4.8         yaml_2.3.8         lazyeval_0.2.2     pillar_1.9.0      
## [25] crayon_1.5.2       jquerylib_0.1.4    ellipsis_0.3.2     cachem_1.0.8      
## [29] abind_1.4-5        nlme_3.1-164       tidyselect_1.2.0   digest_0.6.34     
## [33] stringi_1.8.3      pander_0.6.5       bookdown_0.37      labeling_0.4.3    
## [37] splines_4.3.2      rmdformats_1.0.4   fastmap_1.1.1      grid_4.3.2        
## [41] colorspace_2.1-0   cli_3.6.2          magrittr_2.0.3     utf8_1.2.4        
## [45] withr_3.0.0        scales_1.3.0       timechange_0.3.0   rmarkdown_2.25    
## [49] httr_1.4.7         cellranger_1.1.0   hms_1.1.3          evaluate_0.23     
## [53] viridisLite_0.4.2  mgcv_1.9-1         Rcpp_1.0.12        glue_1.7.0        
## [57] xml2_1.3.6         svglite_2.1.3      rstudioapi_0.15.0  jsonlite_1.8.8    
## [61] R6_2.5.1           systemfonts_1.0.5